options(scipen = 999)
#library(plyr)
library(tidyverse)
library(tidyselect)
library(urbnthemes)
library(sf)
library(tigris)
library(areal)
set_urbn_defaults(style = "print")
We use the 2020 Precinct-Level Election Results from the the Voting and Election Science Team at University of Florida and Wichita State University, accessed via the Harvard Dataverse (data here), for state-by-state shapefiles of 2020 precinct-level general election results.
First, we download the precinct-level shapefiles with election result data.
For more information on how VEST collected election result data and precinct shapefiles, please see their documentation here. Kentucky and New Jersey did not always report results at the precinct level. For these states, VEST apportioned election results from larger geographies to individual precincts based on the average of the vote from the 2016 Presidential election results. Votes for each candidate on the 2020 ballot were distributed from 2020 reporting units to the precincts that comprise those reporting units based on the share of the average 2016 vote from each precinct that was cast for that party’s candidate or for the most ideologically similar minor party candidate. Read more in the documentation linked above.
# Create df for all states except KY and NJ
states <- tribble(
~state, ~statefip, ~id,
"al", "01", 4751074,
"ak", "02", 6550198,
"az", "04", 4864722,
"ar", "05", 4931787,
"ca", "06", 5206371,
"co", "08", 4863166,
"ct", "09", 4986646,
"de", "10", 4773531,
"dc", "11", 4750435,
"fl", "12", 6696063,
"ga", "13", 4863164,
"hi", "15", 4750434,
"id", "16", 4789401,
"il", "17", 4773525,
"in", "18", 5143396,
"ia", "19", 4789403,
"ks", "20", 6696064,
"la", "22", 5739918,
"me", "23", 5739920,
"md", "24", 5111331,
"ma", "25", 5007849,
"mi", "26", 5739924,
"mn", "27", 4499011,
"ms", "28", 5706487,
"mo", "29", 5007850,
"mt", "30", 4773527,
"ne", "31", 5739922,
"nv", "32", 4863168,
"nh", "33", 4499009,
"nm", "35", 5425599,
"ny", "36", 5259468,
"nc", "37", 4863162,
"nd", "38", 5342900,
"oh", "39", 4499012,
"ok", "40", 5790364,
"or", "41", 5194704,
"pa", "42", 5595329,
"ri", "44", 4789406,
"sc", "45", 4789402,
"sd", "46", 6082788,
"tn", "47", 5746909,
"tx", "48", 4931788,
"ut", "49", 4863167,
"vt", "50", 5739919,
"va", "51", 6174181,
"wa", "53", 5007851,
"wv", "54", 6418344,
"wi", "55", 4773528,
"wy", "56", 4789404,
"ky", "21", 6550199,
"nj", "34", 6492876
) %>%
mutate(url =
if_else(
!state %in% c("ky", "nj"),
here::here("05_local-governance", "voter-turnout", "vest", paste0(state, "_2020.zip")),
here::here("05_local-governance", "voter-turnout", "vest", paste0(state, "_2020_vtd_estimates.zip"))
)
)
Download precinct-level election turnout and shapefile data
# Create object for VEST directory, and create the directory if it doesn't exist. "vest" is the acronym for the Voting and Election Science Team at University of Florida and Wichita State
vest_directory <- here::here("05_local-governance", "voter-turnout", "vest")
if (!dir.exists(vest_directory)) {
dir.create(vest_directory)
}
# create a vector of URLs for the state data
urls <- c(
# Create vector of URLs for all states except KY and NJ
paste0("https://dataverse.harvard.edu/api/access/datafile/",
states$id[1:49],
"?format=original&gbrecs=true"),
# Create vector of URLs for KY and NJ
paste0("https://dataverse.harvard.edu/api/access/datafile/",
states$id[50:51],
"?format=original&gbrecs=true")
)
# Create object for VEST data, and download the zip file for each state if it isn't downloaded
vest_data <- c(
# All states except KY and NJ
here::here(paste0("05_local-governance/voter-turnout/vest/",
states$state[1:49],
"_2020.zip")),
# KY and NJ
here::here(paste0("05_local-governance/voter-turnout/vest/",
states$state[50:51],
"_2020_vtd_estimates.zip"))
)
if (!all(file.exists(vest_data))) {
download.file(urls,
destfile = vest_data)
}
rm(urls, vest_directory, vest_data)
Read in the precinct shapefiles and voting results data for each
state. Each data set has several variables that begin with
G20PRE that indicate the number of votes each presidential
candidate received in that precinct. We sum these variables to create a
new variable called total_votes, and only keep this
variable and the geometry. We append all states together and create a
GEOID variable based on row number which we will need for
interpolation later.
read_precinct <- function(file) {
tempfile <- tempfile()
unzip(file, exdir = tempfile)
st_read(tempfile) %>%
dplyr::mutate(total_votes = across(starts_with("G20PRE")) %>% rowSums) %>%
dplyr::select(any_of(c("STATEFP20", "COUNTYFP20", "NAME20", "COUNTY")), total_votes, geometry) %>%
sf::st_transform(crs = 6549)
}
precincts <- map(states$url, read_precinct)
precincts <- precincts %>%
reduce(bind_rows)
# create unique GEOID for interpolation
precincts <- precincts %>%
dplyr::mutate(GEOID20 = row_number())
Load the census place shapefiles from library(tigris)
and filter down to our 486 cities. Because KY and NJ precinct files had
naming conventions that differed from other states, we read in the place
shapefiles for these states separately and then append both files.
read_places <- function(fips) {
tigris::places(state = fips,
cb = TRUE,
year = 2020,
progress_bar = FALSE) %>%
select(GEOID, geometry) %>%
sf::st_transform(crs = 6549)
}
places <- map_df(states$statefip, read_places)
# Read in our 486 places for 2020 and join shapefiles
my_places <- read_csv(here::here("geographic-crosswalks", "data", "place-populations.csv")) %>%
filter(year == 2020) %>%
mutate(GEOID = str_c(state, place)) %>%
left_join(y = places, by = "GEOID") %>%
st_as_sf()
The precinct shapefiles do not all form a closed linestring (i.e.,
the polygons in these shapefiles do not all have identical start and
endpoints). Therefore the geometries are not valid and
aw_interpolate() does not work.
Furthermore, several of the precinct geometries include Z and/or M dimensions. Z-values are most commonly used to represent elevations, but they can also represent other measurements such as annual rainfall or air quality. M-values are used to interpolate distances along linear features, such as roads, streams, and pipelines (two commonly used M-values are milepost distance from a set location, such as county line, and distance from a reference marker). See here for more information about Z and M values.
# # If we try using `aw_interpolate()`, we get the following: "Evaluation error: IllegalArgumentException: Points of LinearRing do not form a closed linestring."
# interpolated <- aw_interpolate(.data = places,
# tid = GEOID,
# source = precincts,
# sid = GEOID20,
# weight = "total",
# output = "sf",
# extensive = "total_votes")
# # If we try to check which observations are invalid, we get the following: "GEOS does not support XYM or XYZM geometries; use st_zm() to drop M"
# test <- st_is_valid(precincts)
We drop Z/M values from precinct geometries and assign flag for observations with Z/M values. There are 40,674 observations this applies to.
# Create a dummy data set with corrected geometries
test <- precincts %>%
st_zm(drop = TRUE)
# Anti-join the corrected geometries to the original precinct data to identify which observations had Z/M values. Create a variable to flag these observations. There are 40,674 observations this applies to.
anti_joined <- anti_join(as.data.frame(precincts), as.data.frame(test), by = "geometry") %>%
mutate(zm_flag = 1) %>%
select(-geometry)
anti_joined %>%
nrow()
## [1] 40674
# Now correct geometries in original precinct data
precincts <- precincts %>%
st_zm(drop = TRUE)
# Join the flags for Z/M values onto our original precinct data and check number of ZM flags
precincts <- left_join(
precincts,
select(anti_joined, GEOID20, zm_flag),
by = "GEOID20"
)
precincts %>%
as.data.frame() %>%
dplyr::count(zm_flag)
## zm_flag n
## 1 1 40674
## 2 NA 137460
# Replace `NA` values for observations that didn't receive a Z/M flag with `0`, then check number of ZM flags
precincts <- precincts %>%
mutate(zm_flag = case_when(is.na(zm_flag) ~ 0,
zm_flag == 1 ~ 1))
precincts %>%
as.data.frame() %>%
dplyr::count(zm_flag)
## zm_flag n
## 1 0 137460
## 2 1 40674
# Remove obsolete data sets
rm(test, anti_joined)
We still cannot interpolate yet because there are still 417 invalid geometries for the following reasons: “Nested shells,” “Ring Self-intersection,” “Self-intersection,” and “Too few points in geometry component.” We identify and flag the observations with invalid geometries, and then validate them to perform the interpolation.
# # If we try using `aw_interpolate()`, we get the following: "Evaluation error: TopologyException: Input geom 0 is invalid: Self-intersection at -4246267.6594204018 982769.49896414403."
# interpolated <- aw_interpolate(.data = places,
# tid = GEOID,
# source = precincts,
# sid = GEOID20,
# weight = "total",
# output = "sf",
# extensive = "total_votes")
# Isolate invalid geometries
invalid_geom <- precincts %>%
filter(!st_is_valid(precincts))
# Check reasons for invalid geometries
st_is_valid(invalid_geom, reason = TRUE) %>%
as.data.frame() %>%
mutate(reason = str_extract(string = st_is_valid(invalid_geom, reason = TRUE),
pattern = ".*\\[")) %>%
dplyr::count(reason)
## reason n
## 1 Hole lies outside shell[ 1
## 2 Nested shells[ 1
## 3 Ring Self-intersection[ 193
## 4 Self-intersection[ 219
## 5 Too few points in geometry component[ 13
# Create a variable to flag these observations
invalid_geom <- invalid_geom %>%
mutate(invalid_flag = 1) %>%
as.data.frame() %>%
select(-geometry, -zm_flag)
# Join the flags for invalid geometries onto our original precinct data and check number of invalid geometries
precincts <- left_join(
precincts,
select(invalid_geom, GEOID20, invalid_flag),
by = "GEOID20",
)
as.data.frame(precincts) %>%
dplyr::count(invalid_flag)
## invalid_flag n
## 1 1 427
## 2 NA 177707
# Replace `NA` values for observations that didn't receive an invalid flag with `0`, then check number of invalid flags
precincts <- precincts %>%
mutate(invalid_flag = case_when(is.na(invalid_flag) ~ 0,
invalid_flag == 1 ~ 1))
as.data.frame(precincts) %>%
dplyr::count(invalid_flag)
## invalid_flag n
## 1 0 177707
## 2 1 427
# Now that we've flagged which geometries are invalid, we can make them valid to proceed with the interpolation
precincts <- st_make_valid(precincts)
# Remove obsolete data sets
rm(invalid_geom)
The number of intersections is a little greater than the number of observations used for the interpolation.
# check overlap
intersections <- st_intersects(precincts, my_places)
sum(lengths(intersections) > 0)
## [1] 59542
The following calculates the proportion of area for each place that is covered by the precincts.
# source: places
# target: precincts
places_weights <- aw_preview_weights(
.data = precincts,
tid = GEOID20,
source = my_places,
sid = GEOID,
type = "extensive"
)
arrange(places_weights, extensiveTotal)
## # A tibble: 486 × 3
## GEOID extensiveSum extensiveTotal
## <chr> <dbl> <dbl>
## 1 5335940 1 0.789
## 2 1224125 1 0.830
## 3 3651000 1.00 0.866
## 4 5363000 1 0.873
## 5 2507000 1 0.888
## 6 5305210 1.00 0.898
## 7 3684000 1 0.909
## 8 2555745 1 0.918
## 9 0657456 1 0.929
## 10 3436000 1 0.930
## # ℹ 476 more rows
Some of the places have less than 100% coverage because the shapes include water and the precinct shapes do not include water. Here are maps of all places with less than 90% coverage.
map_trouble <- function(place_id) {
intersections <- st_intersects(
precincts,
filter(my_places, GEOID == place_id)
)
precincts <- precincts[lengths(intersections) > 0, ]
place <- filter(my_places, GEOID == place_id)
ggplot() +
geom_sf(data = precincts,
alpha = 0.3) +
geom_sf(data = place,
alpha = 0.3, fill = "red") +
labs(
title = paste0(place$place_name, "(", place_id, ")"),
subtitle = "Precincts in blue; places in red"
)
}
map_trouble("5335940")
map_trouble("1224125")
map_trouble("3651000")
map_trouble("5363000")
map_trouble("2507000")
map_trouble("5305210")
We use the following specification for the areal interpolation; however, we manually do the interpolation so we can use the weights to construct a quality variable.
# I also interpolated using the full places list and then filtering to our 486, which got same result.
# Interpolate using only our 486 cities
result <- aw_interpolate(.data = my_places, # target shapes
tid = GEOID, # target id
source = precincts, # source shapes
sid = GEOID20, # source id
weight = "total",
output = "tibble",
extensive = c("total_votes", "zm_flag", "invalid_flag"))
An example of areal interpolation using the following steps is available here.
int <- aw_intersect(
.data = my_places,
source = precincts,
areaVar = "area"
)
tot <- aw_total(
.data = int,
source = precincts,
id = GEOID20,
areaVar = "area",
totalVar = "totalArea",
type = "extensive",
weight = "total"
)
weight <- aw_weight(
.data = tot,
areaVar = "area",
totalVar = "totalArea",
areaWeight = "areaWeight"
)
result <- weight %>%
st_drop_geometry() %>%
mutate(total_votes = total_votes * areaWeight) %>%
group_by(state, place, state_name, place_name, GEOID) %>%
mutate(weight = total_votes / sum(total_votes)) %>%
summarize(
total_votes = sum(total_votes),
messiness = weighted.mean(
x = 0.5 - abs(areaWeight - 0.5),
w = weight
)
) %>%
ungroup()
map_trouble("1380508")
map_trouble("4816432")
map_trouble("0135896")
Compare the VEST precinct-level election returns used in our interpolation to the MIT Election Lab precinct-level returns
# mit <- read_csv(here::here("05_local-governance",
# "voter-turnout",
# "data",
# "mit",
# "precinctpres_2000-2020.csv"))
#
# mit <- mit %>%
# # XX I can't tell if overvotes and undervotes should be counted in the total votes for each precinct.
# # The readme file doesn't tell me and there is nothing to indicate whether they're counted in the VEST data
# filter(!candidate %in% c("OVERVOTES", "UNDERVOTES")) %>%
# group_by(precinct, county_fips) %>%
# summarize(mit_votes = sum(votes))
#
# # XX There is no way to directly compare this to the VEST precinct data because each precinct does not have identical GEOIDs across the two data sets. I was thinking maybe we can look at the distributions?
Interpolate the precinct-level election returns data to the county instead of place. Compare these results to the MIT Election Lab county-level returns.
# Load MIT Election Lab county-level returns. This code is copied directly from `voter-turnout.Rmd` which used the same data
# mit <- read_csv(here::here("05_local-governance",
# "voter-turnout",
# "data",
# "mit",
# "countypres_2000-2020.csv")) %>%
# rename(FIPS = county_fips) %>%
# tidylog::filter(year == 2020) %>%
# mutate(FIPS = if_else(FIPS == "46113", "46102", FIPS)) %>%
# mutate(FIPS = case_when(county_name == "DISTRICT OF COLUMBIA" ~ "11001",
# TRUE ~ FIPS)) %>%
# tidylog::filter(!(state == 'RHODE ISLAND' & county_name == 'FEDERAL PRECINCT')) %>%
# mutate(FIPS = if_else(state == "ALASKA", "02000", FIPS))
#
# mit_counties <- mit %>%
# group_by(year, state, county_name, FIPS) %>%
# summarize(mit_votes = sum(candidatevotes)) %>%
# ungroup() %>%
# mutate(state = str_sub(FIPS, 1, 2),
# county = str_sub(FIPS, 3, 5)) %>%
# select(year,
# state,
# county,
# FIPS,
# mit_votes)
#
# # Now we load a county-level shapefile and join this to the MIT election returns
# counties <- tigris::counties(state = NULL,
# cb = TRUE,
# year = 2020,
# progress_bar = FALSE) %>%
# select(STATEFP, COUNTYFP, GEOID, NAME) %>%
# # Dropping Alaska for ease since we change all counties to 02000 in MIT data
# filter(!STATEFP %in% c("02", "60", "66", "69", "72", "78")) %>%
# rename(FIPS = GEOID,
# state = STATEFP,
# county = COUNTYFP) %>%
# left_join(y = mit_counties, by = c("state", "county", "FIPS"))
#
# # Remove obsolete files
# rm(mit, mit_counties)
#
# # Interpolate to county and check values against MIT values
# st_crs(precincts)
# st_crs(counties)
# counties <- st_transform(counties, crs = 6549)
#
# # # XX When using `aw_interpolate()` below, I got an error: "CBR: result (after common-bits addition) is INVALID: Self-intersection at or near point..." but the function still gave a result...
# # ar_validate(source = precincts,
# # target = counties,
# # varList = c("total_votes", "zm_flag", "invalid_flag"),
# # verbose = TRUE)
#
# test_result <- aw_interpolate(.data = counties,
# tid = FIPS,
# source = precincts,
# sid = GEOID20,
# weight = "total",
# output = "sf",
# extensive = c("total_votes", "zm_flag", "invalid_flag"))
#
# # Calculate percentage difference between county-level MIT votes and VEST votes aggregated from precincts
# # XX I'm not sure how useful this is because the results were not identical at the precinct level to begin with
# test_result <- test_result %>%
# mutate(pct_diff = (abs(total_votes - mit_votes) / ((total_votes + mit_votes)/2)) * 100)
#
# # Kalawao County, HI is missing
# quantile(test_result$pct_diff, na.rm = TRUE)
#
# # XX Maybe identify counties in the top quartile of percentage difference and flag each place in those counties for quality? Will be annoying because places cross county lines
There are a few independent cities and counties in our data set. In these cases, the census place perfectly matches a county and we can pull the county results and compare them to the interpolated place results.
# pull county results
mit <- read_csv(here::here("05_local-governance",
"voter-turnout",
"mit",
"countypres_2000-2020.csv")) %>%
filter(year == 2020) %>%
group_by(county_fips) %>%
summarize(votes = sum(candidatevotes))
# pull crosswalk to make join possible
place_to_county <- read_csv(here::here("geographic-crosswalks", "data", "county-populations.csv")) %>%
filter(year == 2020) %>%
select(state, county, county_name)
# join data
independent_results <- result %>%
left_join(place_to_county, by = c("state", "place_name" = "county_name")) %>%
filter(!is.na(county)) %>%
mutate(county_fips = paste0(state, county)) %>%
left_join(mit, by = "county_fips")
# compare data
independent_results %>%
ggplot(aes(votes, total_votes)) +
geom_abline() +
geom_point(alpha = 0.3) +
labs(
title = "The interpolated results approximately match the reported results",
x = "mit_votes",
y = "vest_votes"
) +
coord_equal() +
scatter_grid()
The denominator for the analysis should be the total age-eligible citizen population (data here)
The US Census Bureau creates a special tabulation of the 2016-2020 ACS that includes county-level estimates of the total number of United States citizens 18 years of age or older. They also report estimates for subgroups within counties and for other geographic areas.
If the data are not downloaded, then we download and unzip the data.
cvap_zip <- here::here("05_local-governance",
"voter-turnout",
"data",
"CVAP_2016-2020_ACS_csv_files.zip")
cvap_data <- here::here("05_local-governance",
"voter-turnout",
"data",
"CVAP_2016-2020_ACS_csv_files")
if (!file.exists(cvap_data)) {
download.file(url = "https://www2.census.gov/programs-surveys/decennial/rdo/datasets/2020/2020-cvap/CVAP_2016-2020_ACS_csv_files.zip",
destfile = cvap_zip)
unzip(zipfile = cvap_zip,
exdir = cvap_data)
file.remove(cvap_zip)
}
Next, we load and clean the data. The variable of interest is
cvap_est.
cvap_est: The rounded estimate of the total number of United States citizens 18 years of age or older for that geographic area and group.
cvap <- read_csv(here::here("05_local-governance",
"voter-turnout",
"data",
"CVAP_2016-2020_ACS_csv_files",
"Place.csv"))
cvap <- cvap %>%
tidylog::filter(lntitle == "Total") %>%
select(-lntitle, -lnnumber)
cvap <- cvap %>%
mutate(state = str_sub(string = geoid, start = 10, end = 11),
place = str_sub(string = geoid, start = -5, end = -1),
GEOID = str_c(state, place))
cvap <- cvap %>%
tidylog::filter(state != "72") %>% #Drop Puerto Rico
select(state,
place,
GEOID,
cvap = cvap_est,
cvap_moe)
We combine the data. The join works for all places.
joined_data <- left_join(result, cvap, by = c("GEOID", "state", "place"))
We calculate turnout and the coefficient of variation for the CVAP estimate.
joined_data <- joined_data %>%
mutate(election_turnout = total_votes / cvap) %>%
mutate(cv = (cvap_moe / 1.645) / cvap)
No observations have voter turnout above 1.
joined_data %>%
ggplot(aes(cvap, total_votes)) +
geom_abline() +
geom_point(alpha = 0.2) +
scatter_grid()
joined_data %>%
filter(total_votes > cvap)
## # A tibble: 0 × 11
## # ℹ 11 variables: state <chr>, place <chr>, state_name <chr>, place_name <chr>,
## # GEOID <chr>, total_votes <dbl>, messiness <dbl>, cvap <dbl>,
## # cvap_moe <dbl>, election_turnout <dbl>, cv <dbl>
Check: Is voter turnout bounded by 0 and 1 inclusive
stopifnot(
max(joined_data$election_turnout, na.rm = TRUE) <= 1
)
stopifnot(
min(joined_data$election_turnout, na.rm = TRUE) >= 0
)
joined_data %>%
ggplot(aes(total_votes, election_turnout)) +
geom_point(alpha = 0.1) +
scale_x_log10() +
scale_y_continuous(limits = c(0, 1),
expand = expansion(mult = c(0, 0.1))) +
scatter_grid() +
labs(title = "There Isn't Much Relationship Between Turnout and Votes")
joined_data %>%
ggplot(aes(cvap, election_turnout)) +
geom_point(alpha = 0.1) +
scale_x_log10() +
scale_y_continuous(limits = c(0, 1),
expand = expansion(mult = c(0, 0.1))) +
scatter_grid() +
labs(title = "There Isn't Much Relationship Between CVAP and Votes")
We consider three dimensions when evaluating the quality of the constructed metric.
Sampling error in the denominator is definitely a concern for small places. We flag cases with high and very high coefficients of variation in the denominator.
1 No issue2 CV >= 0.053 CV >= 0.15There isn’t much consensus on critical values for coefficients of
variation. We use 0.15 because it is mentioned A
Compass for Understanding and Using American Community Survey
Data.
While there is no hard-and-fast rule, for the purposes of this handbook, estimates with CVs of more than 15 percent are considered cause for caution when interpreting patterns in the data.
If anything, a stricter threshold is necessary because the estimates
are used in denominators. Thus, we use 0.05 for a
2.
joined_data <- joined_data %>%
mutate(
denominator_quality = case_when(
is.na(election_turnout) ~ NA_real_,
cv >= 0.15 ~ 3,
cv >= 0.05 ~ 2,
TRUE ~ 1
)
)
The data for Kentucky and New Jersey was flawed because of issues with mail-in ballots during the Covid-19 pandemic.
For the ky_2020_vtd_estimates shapefile the 2020 election results have been further apportioned to individual precincts based on the vote from the 2016 election results for President and for US Senate. The 2016 election results were adjusted where necessary to account for changes in precinct boundaries and modified to account for the change in the number of ballots cast by precinct between November 2016 and November 2020. Votes for each candidate on the 2020 ballot were then distributed from 2020 reporting units to the precincts that comprise those reporting units based on the adjusted share of the 2016 vote from each precinct that was cast for that party’s candidate or for the most ideologically similar candidate.
For the nj_2020_vtd_estimates shapefile the 2020 election results have been further apportioned to individual precincts based on the average of the vote from the 2016 election results for President and the 2018 election results for US Senate. Votes for each candidate on the 2020 ballot were distributed from 2020 reporting units to the precincts that comprise those reporting units based on the share of the average 2016/2018 vote from each precinct that was cast for that party’s candidate or for the most ideologically similar minor party candidate.
Accordingly, we downgrade observations from the states.
joined_data <- joined_data %>%
mutate(
allocation_quality =
if_else(
condition = state %in% c("21", "34"),
true = 3,
false = 1
)
)
The quality of the numerator may be a concern for precincts with Z/M features in their geometry or with an invalid geometry. We already created flags for these cases.
We calculated a messiness measure during the areal interpolation. Precincts that mostly or barely intersect a place are ideal because the assumption of an evenly distributed population matters the least. A precinct that is only half inside a place is concerning.
areaWeight is bounded by zero and one. First, we
calculate 0.5 - abs(areaWeight - 0.5). This gives the
distance from areaWeight to the closer of 0 or
1. We then calculate a weighted average of these values to
calculate messiness.
1 = messiness > 0.052 = messiness > 0.13 = messiness > 0.2joined_data <- joined_data %>%
mutate(
interpolation_quality =
case_when(
messiness < 0.05 ~ 1,
messiness < 0.1 ~ 2,
TRUE ~ 3
)
)
Let’s look at each of the quality variables.
count(
joined_data,
interpolation_quality,
allocation_quality,
denominator_quality
)
## # A tibble: 4 × 4
## interpolation_quality allocation_quality denominator_quality n
## <dbl> <dbl> <dbl> <int>
## 1 1 1 1 408
## 2 1 3 1 8
## 3 2 1 1 42
## 4 3 1 1 28
Let’s use the worse of the three quality tests to set the quality.
joined_data <- joined_data %>%
mutate(election_turnout_quality = pmax(interpolation_quality, allocation_quality, denominator_quality))
joined_data %>%
ggplot(aes(factor(election_turnout_quality), election_turnout)) +
geom_point(alpha = 0.1) +
scale_y_continuous(limits = c(0, 1),
expand = expansion(mult = c(0, 0.1))) +
scatter_grid() +
labs(title = "Very High Turnout is Not Associated with Poor Data Quality")
joined_data %>%
mutate(year = 2020) %>%
select(year, state, place, election_turnout, election_turnout_quality) %>%
write_csv(here::here("05_local-governance",
"voter-turnout",
"voter-turnout-city-2020.csv"))